# Week 2 Lab Session - Data Viz

# OUTLINE: 
# 1. Data visualization
# 2. Data wrangling and the Tidyverse (dplyr)
# 3. Look over Assignment 1

# Load packages
library(tidyverse) #ggplot2, readr, dplyr, etc.
library(haven) #opening Stata datasets (read_dta)
library(skimr) #descriptive statistics (skim)
library(nycflights13) #flights and weather datasets used in the moderndive book

# These we'll need to install
# (uncomment the install.packages() lines the first time you run this)
#install.packages("scales")
library(scales) #for labeling axes with %, $, etc.
#install.packages("ggridges")
library(ggridges) #ridge plots
#install.packages("moderndive")
library(moderndive) #alaska_flights and early_january_weather datasets
#install.packages("ggrepel")
library(ggrepel)
#install.packages("questionr")
library(questionr) #for frequency tables


# Load data
# The file names are relative paths, so the .dta files must be in the working directory.
nes <- read_dta("nes.dta")
states <- read_dta("states.dta")
vdem <- read_dta("vdem.dta")

# Calling up variables in datasets: $ method, "data$variable"
# Here skim() (skimr package) summarizes the single column pulled out with $.
skim(vdem$v2x_polyarchy)
Data summary
Name vdem$v2x_polyarchy
Number of rows 179
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0.52 0.25 0.02 0.29 0.52 0.75 0.91 ▃▆▆▆▇
# For skimr, you can also use skim(data, variable)
skim(vdem, v2x_polyarchy)
Data summary
Name vdem
Number of rows 179
Number of columns 4171
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
v2x_polyarchy 0 1 0.52 0.25 0.02 0.29 0.52 0.75 0.91 ▃▆▆▆▇
# Frequency table, using questionr package:
# freq() reports the count (n) and percentages for each category of the variable.
freq(vdem$v2x_regime)
                         n    % val%
[0] Closed Autocracy    25 14.0 14.0
[1] Electoral Autocracy 64 35.8 35.8
[2] Electoral Democracy 54 30.2 30.2
[3] Liberal Democracy   36 20.1 20.1
###################################################################################### 
################################## Histograms   ################################## 
###################################################################################### 

# We'll use the "weather" data from nycflights13 package; 
# Hourly meteorological data for LGA, JFK and EWR.
# We want to visualize the distribution of a single variable with a histogram; 
# these are ideal for "continuous" variables with lots of values.

# Assign the package dataset to an object of the same name in our workspace
# (presumably so it shows up in the Environment pane and can be viewed).
weather <- weather
# Uncomment the next line to open the help page describing the variables.
#?weather

# We'll focus on temperatures ("temp") at these three airports
# Note: geometry for a histogram is "geom_histogram"
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram() 

# Too clumpy; let's delineate the bars some more. Always use color = "white" option.
# (color sets the outline of each bar.)
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(color = "white")

# We can also adjust the "bins" or the width of the bars; more or less fine-grained
# bins = number of bars to use
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(bins = 40, color = "white")

# binwidth = width of each bar, in the units of the x variable
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(binwidth = 10, color = "white")

# Rules for binwidth? The Freedman-Diaconis rule:
# binwidth <- 2 * IQR / (N^(1/3))
# skim() reports the quartiles (p25, p75) and the number of rows used in the formula.
skim(weather, temp)
Data summary
Name weather
Number of rows 26115
Number of columns 15
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
temp 1 1 55.26 17.79 10.94 39.92 55.4 69.98 100.04 ▂▇▇▇▁
# Freedman-Diaconis bin width: 2 * IQR / N^(1/3).
# Compute it from the data instead of hand-copying rounded numbers from the
# skim() output: IQR() gives p75 - p25 directly, and N counts only the observed
# temperatures (temp has a missing value, so na.rm = TRUE is required and the
# row count would overstate N).
bw <- 2 * IQR(weather$temp, na.rm = TRUE) / sum(!is.na(weather$temp))^(1/3)

ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(binwidth = bw, color = "white")

# Facet wraps: These are very cool for subsetting by some grouping
# (here: one histogram panel per month)

ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(binwidth = 5, color = "white") +
  facet_wrap(~month)

# We can also report y-axis in percentage terms. after_stat() gives access to
# the bin counts that geom_histogram computes; each bin's count divided by the
# total count is that bin's share of all hours. (after_stat(count) replaces the
# older ..count.. notation, which is deprecated in current ggplot2.)
# scale_y_continuous(labels = percent) uses the "scales" package
# binwidth = bw reuses the Freedman-Diaconis bin width computed earlier
# also, label axes, give title
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = bw,
                 color = "white") +
  scale_y_continuous(labels = percent) +
  labs(x = "Temperature", y = "Percentage of Hours", title = "Temperatures")

# Now change labeling for y-axis values
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = bw,
                 color = "white") +
  scale_y_continuous(limits = c(0, .06), breaks = seq(0, .06, by = .01),
                     labels = percent) +
  labs(x = "Temperature", y = "Percentage of Hours", title = "Temperatures")

# Change theme for graph background
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = bw,
                 color = "white") +
  scale_y_continuous(limits = c(0, .06), breaks = seq(0, .06, by = .01),
                     labels = percent) +
  labs(x = "Temperature", y = "Percentage of Hours", title = "Temperatures") +
  theme_minimal()

# Center title (hjust = 0.5 places the title halfway across the plot)
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = bw,
                 color = "white") +
  scale_y_continuous(limits = c(0, .06), breaks = seq(0, .06, by = .01),
                     labels = percent) +
  labs(x = "Temperature", y = "Percentage of Hours", title = "Temperatures") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

########### NOTE ON SYNTAX ###############

# There are some aspects of the syntax used above that are unnecessary and that we can
# eliminate. For example: 

# This command: 
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram()

# is identical to this command (note that I've removed "data=" and "mapping="): 
# R matches unnamed arguments by position, so the first is the data and the second the mapping.
ggplot(weather, aes(x = temp)) +
  geom_histogram()

# You actually don't even need "x=" but I like to include that to make it clear.
ggplot(weather, aes(temp)) +
  geom_histogram()

# From now on, we'll simplify to the second command (no "data=" or "mapping=", but keep "x=")

###################################################################################### 
################################## Density Plots   ################################## 
###################################################################################### 

# Density plots show the distribution of a single variable as a smooth curve.
# The geometry for a density plot is "geom_density"
ggplot(weather, aes(x = temp)) +
  geom_density() +
  labs(x="Temperature", y="Density", title="Temperatures") +
  theme(plot.title = element_text(hjust = 0.5))

# Fill with color (fill = area under the curve)
ggplot(weather, aes(x = temp)) +
  geom_density(fill="dodgerblue") +
  labs(x="Temperature", y="Density", title="Temperatures") +
  theme(plot.title = element_text(hjust = 0.5))

# Full list of colors:
# colors() prints the built-in color names, e.g. the "dodgerblue" used above.
colors()
  [1] "white"                "aliceblue"            "antiquewhite"        
  [4] "antiquewhite1"        "antiquewhite2"        "antiquewhite3"       
  [7] "antiquewhite4"        "aquamarine"           "aquamarine1"         
 [10] "aquamarine2"          "aquamarine3"          "aquamarine4"         
 [13] "azure"                "azure1"               "azure2"              
 [16] "azure3"               "azure4"               "beige"               
 [19] "bisque"               "bisque1"              "bisque2"             
 [22] "bisque3"              "bisque4"              "black"               
 [25] "blanchedalmond"       "blue"                 "blue1"               
 [28] "blue2"                "blue3"                "blue4"               
 [31] "blueviolet"           "brown"                "brown1"              
 [34] "brown2"               "brown3"               "brown4"              
 [37] "burlywood"            "burlywood1"           "burlywood2"          
 [40] "burlywood3"           "burlywood4"           "cadetblue"           
 [43] "cadetblue1"           "cadetblue2"           "cadetblue3"          
 [46] "cadetblue4"           "chartreuse"           "chartreuse1"         
 [49] "chartreuse2"          "chartreuse3"          "chartreuse4"         
 [52] "chocolate"            "chocolate1"           "chocolate2"          
 [55] "chocolate3"           "chocolate4"           "coral"               
 [58] "coral1"               "coral2"               "coral3"              
 [61] "coral4"               "cornflowerblue"       "cornsilk"            
 [64] "cornsilk1"            "cornsilk2"            "cornsilk3"           
 [67] "cornsilk4"            "cyan"                 "cyan1"               
 [70] "cyan2"                "cyan3"                "cyan4"               
 [73] "darkblue"             "darkcyan"             "darkgoldenrod"       
 [76] "darkgoldenrod1"       "darkgoldenrod2"       "darkgoldenrod3"      
 [79] "darkgoldenrod4"       "darkgray"             "darkgreen"           
 [82] "darkgrey"             "darkkhaki"            "darkmagenta"         
 [85] "darkolivegreen"       "darkolivegreen1"      "darkolivegreen2"     
 [88] "darkolivegreen3"      "darkolivegreen4"      "darkorange"          
 [91] "darkorange1"          "darkorange2"          "darkorange3"         
 [94] "darkorange4"          "darkorchid"           "darkorchid1"         
 [97] "darkorchid2"          "darkorchid3"          "darkorchid4"         
[100] "darkred"              "darksalmon"           "darkseagreen"        
[103] "darkseagreen1"        "darkseagreen2"        "darkseagreen3"       
[106] "darkseagreen4"        "darkslateblue"        "darkslategray"       
[109] "darkslategray1"       "darkslategray2"       "darkslategray3"      
[112] "darkslategray4"       "darkslategrey"        "darkturquoise"       
[115] "darkviolet"           "deeppink"             "deeppink1"           
[118] "deeppink2"            "deeppink3"            "deeppink4"           
[121] "deepskyblue"          "deepskyblue1"         "deepskyblue2"        
[124] "deepskyblue3"         "deepskyblue4"         "dimgray"             
[127] "dimgrey"              "dodgerblue"           "dodgerblue1"         
[130] "dodgerblue2"          "dodgerblue3"          "dodgerblue4"         
[133] "firebrick"            "firebrick1"           "firebrick2"          
[136] "firebrick3"           "firebrick4"           "floralwhite"         
[139] "forestgreen"          "gainsboro"            "ghostwhite"          
[142] "gold"                 "gold1"                "gold2"               
[145] "gold3"                "gold4"                "goldenrod"           
[148] "goldenrod1"           "goldenrod2"           "goldenrod3"          
[151] "goldenrod4"           "gray"                 "gray0"               
[154] "gray1"                "gray2"                "gray3"               
[157] "gray4"                "gray5"                "gray6"               
[160] "gray7"                "gray8"                "gray9"               
[163] "gray10"               "gray11"               "gray12"              
[166] "gray13"               "gray14"               "gray15"              
[169] "gray16"               "gray17"               "gray18"              
[172] "gray19"               "gray20"               "gray21"              
[175] "gray22"               "gray23"               "gray24"              
[178] "gray25"               "gray26"               "gray27"              
[181] "gray28"               "gray29"               "gray30"              
[184] "gray31"               "gray32"               "gray33"              
[187] "gray34"               "gray35"               "gray36"              
[190] "gray37"               "gray38"               "gray39"              
[193] "gray40"               "gray41"               "gray42"              
[196] "gray43"               "gray44"               "gray45"              
[199] "gray46"               "gray47"               "gray48"              
[202] "gray49"               "gray50"               "gray51"              
[205] "gray52"               "gray53"               "gray54"              
[208] "gray55"               "gray56"               "gray57"              
[211] "gray58"               "gray59"               "gray60"              
[214] "gray61"               "gray62"               "gray63"              
[217] "gray64"               "gray65"               "gray66"              
[220] "gray67"               "gray68"               "gray69"              
[223] "gray70"               "gray71"               "gray72"              
[226] "gray73"               "gray74"               "gray75"              
[229] "gray76"               "gray77"               "gray78"              
[232] "gray79"               "gray80"               "gray81"              
[235] "gray82"               "gray83"               "gray84"              
[238] "gray85"               "gray86"               "gray87"              
[241] "gray88"               "gray89"               "gray90"              
[244] "gray91"               "gray92"               "gray93"              
[247] "gray94"               "gray95"               "gray96"              
[250] "gray97"               "gray98"               "gray99"              
[253] "gray100"              "green"                "green1"              
[256] "green2"               "green3"               "green4"              
[259] "greenyellow"          "grey"                 "grey0"               
[262] "grey1"                "grey2"                "grey3"               
[265] "grey4"                "grey5"                "grey6"               
[268] "grey7"                "grey8"                "grey9"               
[271] "grey10"               "grey11"               "grey12"              
[274] "grey13"               "grey14"               "grey15"              
[277] "grey16"               "grey17"               "grey18"              
[280] "grey19"               "grey20"               "grey21"              
[283] "grey22"               "grey23"               "grey24"              
[286] "grey25"               "grey26"               "grey27"              
[289] "grey28"               "grey29"               "grey30"              
[292] "grey31"               "grey32"               "grey33"              
[295] "grey34"               "grey35"               "grey36"              
[298] "grey37"               "grey38"               "grey39"              
[301] "grey40"               "grey41"               "grey42"              
[304] "grey43"               "grey44"               "grey45"              
[307] "grey46"               "grey47"               "grey48"              
[310] "grey49"               "grey50"               "grey51"              
[313] "grey52"               "grey53"               "grey54"              
[316] "grey55"               "grey56"               "grey57"              
[319] "grey58"               "grey59"               "grey60"              
[322] "grey61"               "grey62"               "grey63"              
[325] "grey64"               "grey65"               "grey66"              
[328] "grey67"               "grey68"               "grey69"              
[331] "grey70"               "grey71"               "grey72"              
[334] "grey73"               "grey74"               "grey75"              
[337] "grey76"               "grey77"               "grey78"              
[340] "grey79"               "grey80"               "grey81"              
[343] "grey82"               "grey83"               "grey84"              
[346] "grey85"               "grey86"               "grey87"              
[349] "grey88"               "grey89"               "grey90"              
[352] "grey91"               "grey92"               "grey93"              
[355] "grey94"               "grey95"               "grey96"              
[358] "grey97"               "grey98"               "grey99"              
[361] "grey100"              "honeydew"             "honeydew1"           
[364] "honeydew2"            "honeydew3"            "honeydew4"           
[367] "hotpink"              "hotpink1"             "hotpink2"            
[370] "hotpink3"             "hotpink4"             "indianred"           
[373] "indianred1"           "indianred2"           "indianred3"          
[376] "indianred4"           "ivory"                "ivory1"              
[379] "ivory2"               "ivory3"               "ivory4"              
[382] "khaki"                "khaki1"               "khaki2"              
[385] "khaki3"               "khaki4"               "lavender"            
[388] "lavenderblush"        "lavenderblush1"       "lavenderblush2"      
[391] "lavenderblush3"       "lavenderblush4"       "lawngreen"           
[394] "lemonchiffon"         "lemonchiffon1"        "lemonchiffon2"       
[397] "lemonchiffon3"        "lemonchiffon4"        "lightblue"           
[400] "lightblue1"           "lightblue2"           "lightblue3"          
[403] "lightblue4"           "lightcoral"           "lightcyan"           
[406] "lightcyan1"           "lightcyan2"           "lightcyan3"          
[409] "lightcyan4"           "lightgoldenrod"       "lightgoldenrod1"     
[412] "lightgoldenrod2"      "lightgoldenrod3"      "lightgoldenrod4"     
[415] "lightgoldenrodyellow" "lightgray"            "lightgreen"          
[418] "lightgrey"            "lightpink"            "lightpink1"          
[421] "lightpink2"           "lightpink3"           "lightpink4"          
[424] "lightsalmon"          "lightsalmon1"         "lightsalmon2"        
[427] "lightsalmon3"         "lightsalmon4"         "lightseagreen"       
[430] "lightskyblue"         "lightskyblue1"        "lightskyblue2"       
[433] "lightskyblue3"        "lightskyblue4"        "lightslateblue"      
[436] "lightslategray"       "lightslategrey"       "lightsteelblue"      
[439] "lightsteelblue1"      "lightsteelblue2"      "lightsteelblue3"     
[442] "lightsteelblue4"      "lightyellow"          "lightyellow1"        
[445] "lightyellow2"         "lightyellow3"         "lightyellow4"        
[448] "limegreen"            "linen"                "magenta"             
[451] "magenta1"             "magenta2"             "magenta3"            
[454] "magenta4"             "maroon"               "maroon1"             
[457] "maroon2"              "maroon3"              "maroon4"             
[460] "mediumaquamarine"     "mediumblue"           "mediumorchid"        
[463] "mediumorchid1"        "mediumorchid2"        "mediumorchid3"       
[466] "mediumorchid4"        "mediumpurple"         "mediumpurple1"       
[469] "mediumpurple2"        "mediumpurple3"        "mediumpurple4"       
[472] "mediumseagreen"       "mediumslateblue"      "mediumspringgreen"   
[475] "mediumturquoise"      "mediumvioletred"      "midnightblue"        
[478] "mintcream"            "mistyrose"            "mistyrose1"          
[481] "mistyrose2"           "mistyrose3"           "mistyrose4"          
[484] "moccasin"             "navajowhite"          "navajowhite1"        
[487] "navajowhite2"         "navajowhite3"         "navajowhite4"        
[490] "navy"                 "navyblue"             "oldlace"             
[493] "olivedrab"            "olivedrab1"           "olivedrab2"          
[496] "olivedrab3"           "olivedrab4"           "orange"              
[499] "orange1"              "orange2"              "orange3"             
[502] "orange4"              "orangered"            "orangered1"          
[505] "orangered2"           "orangered3"           "orangered4"          
[508] "orchid"               "orchid1"              "orchid2"             
[511] "orchid3"              "orchid4"              "palegoldenrod"       
[514] "palegreen"            "palegreen1"           "palegreen2"          
[517] "palegreen3"           "palegreen4"           "paleturquoise"       
[520] "paleturquoise1"       "paleturquoise2"       "paleturquoise3"      
[523] "paleturquoise4"       "palevioletred"        "palevioletred1"      
[526] "palevioletred2"       "palevioletred3"       "palevioletred4"      
[529] "papayawhip"           "peachpuff"            "peachpuff1"          
[532] "peachpuff2"           "peachpuff3"           "peachpuff4"          
[535] "peru"                 "pink"                 "pink1"               
[538] "pink2"                "pink3"                "pink4"               
[541] "plum"                 "plum1"                "plum2"               
[544] "plum3"                "plum4"                "powderblue"          
[547] "purple"               "purple1"              "purple2"             
[550] "purple3"              "purple4"              "red"                 
[553] "red1"                 "red2"                 "red3"                
[556] "red4"                 "rosybrown"            "rosybrown1"          
[559] "rosybrown2"           "rosybrown3"           "rosybrown4"          
[562] "royalblue"            "royalblue1"           "royalblue2"          
[565] "royalblue3"           "royalblue4"           "saddlebrown"         
[568] "salmon"               "salmon1"              "salmon2"             
[571] "salmon3"              "salmon4"              "sandybrown"          
[574] "seagreen"             "seagreen1"            "seagreen2"           
[577] "seagreen3"            "seagreen4"            "seashell"            
[580] "seashell1"            "seashell2"            "seashell3"           
[583] "seashell4"            "sienna"               "sienna1"             
[586] "sienna2"              "sienna3"              "sienna4"             
[589] "skyblue"              "skyblue1"             "skyblue2"            
[592] "skyblue3"             "skyblue4"             "slateblue"           
[595] "slateblue1"           "slateblue2"           "slateblue3"          
[598] "slateblue4"           "slategray"            "slategray1"          
[601] "slategray2"           "slategray3"           "slategray4"          
[604] "slategrey"            "snow"                 "snow1"               
[607] "snow2"                "snow3"                "snow4"               
[610] "springgreen"          "springgreen1"         "springgreen2"        
[613] "springgreen3"         "springgreen4"         "steelblue"           
[616] "steelblue1"           "steelblue2"           "steelblue3"          
[619] "steelblue4"           "tan"                  "tan1"                
[622] "tan2"                 "tan3"                 "tan4"                
[625] "thistle"              "thistle1"             "thistle2"            
[628] "thistle3"             "thistle4"             "tomato"              
[631] "tomato1"              "tomato2"              "tomato3"             
[634] "tomato4"              "turquoise"            "turquoise1"          
[637] "turquoise2"           "turquoise3"           "turquoise4"          
[640] "violet"               "violetred"            "violetred1"          
[643] "violetred2"           "violetred3"           "violetred4"          
[646] "wheat"                "wheat1"               "wheat2"              
[649] "wheat3"               "wheat4"               "whitesmoke"          
[652] "yellow"               "yellow1"              "yellow2"             
[655] "yellow3"              "yellow4"              "yellowgreen"         
# Make transparent with "alpha" (0 = fully transparent, 1 = fully opaque)
ggplot(weather, aes(x = temp)) +
  geom_density(fill="dodgerblue", alpha=.5) +
  labs(x="Temperature", y="Density", title="Temperatures") +
  theme(plot.title = element_text(hjust = 0.5))

# Ridge plots: one density curve per group, stacked along the y-axis
# Notice I have to change month to "factor" variable. Most variables are "continuous"
# by default. I need to change this to "discrete" or ggplot can't graph it like 
# I want it to.
ggplot(weather, aes(x = temp, y=factor(month))) +
  geom_density_ridges(fill="dodgerblue", alpha=.5) +
  labs(x="Temperature", y="Month", title="Temperatures") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

###################################################################################### 
################################## Boxplots     ################################## 
###################################################################################### 

# Boxplots are another way to visualize the distribution of a single variable. These are 
# again ideal for "continuous" variables with lots of values.

# What do boxplots give us exactly? "Low", 25th pctile, 50th pctile (median), 75th pctile, 
# and "high"; and outliers.

# Let's again use the weather data: temperature 
# Geometry for boxplot is "geom_boxplot"
# Mapping temp to y (with no x) draws a single vertical boxplot.
ggplot(weather, aes(y = temp)) +
  geom_boxplot() 

# Get descriptive statistics using skimr package
# (p25, p50 and p75 in the output correspond to the box and its middle line)
skim(weather, temp)
Data summary
Name weather
Number of rows 26115
Number of columns 15
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
temp 1 1 55.26 17.79 10.94 39.92 55.4 69.98 100.04 ▂▇▇▇▁
# Use more fine-grained y-axis labels
# The visible range is set with coord_cartesian() rather than with scale limits:
# limits = c(0, 100) on the scale throws away temperatures above 100 (the max
# is 100.04) before the boxplot statistics are computed, whereas
# coord_cartesian() only zooms the view and keeps every observation.
ggplot(weather, aes(y = temp)) +
  geom_boxplot() +
  scale_y_continuous(breaks = seq(0, 100, by = 5)) +
  coord_cartesian(ylim = c(0, 100))

# Adjust width (xlim leaves empty space on either side of the narrower box)
ggplot(weather, aes(y = temp)) +
  geom_boxplot(width = .3) + xlim(-.2, .4) +
  scale_y_continuous(breaks = seq(0, 100, by = 5)) +
  coord_cartesian(ylim = c(0, 100))

# Now let's look at temp by month.
# We need to treat month as a "factor" or categorical variable instead of a continuous 
# variable. We'll talk more about this distinction later in the class.
# With factor(month) on the x-axis we get one box per month.
ggplot(weather, aes(x = factor(month), y = temp)) +
  geom_boxplot() + 
  labs(x="Month", y="Temperature")

# Interpretation: "The 'whiskers' are set to extend out no more than 1.5 × IQR 
# (75th minus 25th) units away from either end of the boxes. We say 'no more than' 
# because the ends of the whiskers have to correspond to observed temperatures."

# Note that the dots outside the whiskers are "outliers." 

###################################################################################### 
################################## Barplots     ################################## 
###################################################################################### 

# Barplots are ideal for visualizing the spread of a "categorical" 
# or "discrete" variable with few values, i.e., ordinal or nominal variable.
# This is related to a frequency distribution.

# Let's use the flights data from nycflights13 to examine the frequency of flights by 
# carrier. 

# Assign the package dataset to an object of the same name in our workspace.
flights <- flights

# Let's use the "carrier" variable.
# Uncomment the next line to open the help page describing the variables.
#?flights

# We could get a frequency distribution in table form using the 
# "freq" command from the questionr package.
freq(flights$carrier)
       n    % val%
9E 18460  5.5  5.5
AA 32729  9.7  9.7
AS   714  0.2  0.2
B6 54635 16.2 16.2
DL 48110 14.3 14.3
EV 54173 16.1 16.1
F9   685  0.2  0.2
FL  3260  1.0  1.0
HA   342  0.1  0.1
MQ 26397  7.8  7.8
OO    32  0.0  0.0
UA 58665 17.4 17.4
US 20536  6.1  6.1
VX  5162  1.5  1.5
WN 12275  3.6  3.6
YV   601  0.2  0.2
# Now ggplot; geometry for bar graph is "geom_bar"
# geom_bar counts the rows in each category for us, so we only supply x.
ggplot(flights, aes(x = carrier)) +
  geom_bar()

# Stacked bar graphs; let's say you wanted to break down by origin (EWR, JFK, or LGA)
ggplot(flights, aes(x = carrier, fill = origin)) +
  geom_bar()

# Grouped bar graphs; same breakdown by origin, but position = "dodge" puts the
# bars side by side instead of stacking them
ggplot(flights, aes(x = carrier, fill = origin)) +
  geom_bar(position = "dodge")

# We could also facet wrap (ncol = 1 stacks the three panels in a single column)
ggplot(flights, aes(x = carrier)) +
  geom_bar() +
  facet_wrap(~ origin, ncol = 1)

# Note how y-axis is a count by default. We could report as a proportion instead.
# after_stat() gives access to the counts geom_bar computes; each carrier's
# count divided by the total count is its share of all flights. (after_stat(count)
# replaces the older ..count.. notation, which is deprecated in current ggplot2.)
ggplot(flights, aes(x = carrier)) +
  geom_bar(aes(y = after_stat(count / sum(count))))

# Percentage much more intuitive; the third line uses the package "scales", which
# converts the proportion to a percentage and adds a percent sign.
ggplot(flights, aes(x = carrier)) +
  geom_bar(aes(y = after_stat(count / sum(count)))) +
  scale_y_continuous(labels = percent)

# Same plot, now with informative axis labels
ggplot(flights, aes(x = carrier)) +
  geom_bar(aes(y = after_stat(count / sum(count)))) +
  scale_y_continuous(labels = percent) +
  labs(x = "Carrier", y = "Percentage")

###################################################################################### 
################################## Scatterplots ################################## 
###################################################################################### 

# Alaska data (from moderndive package); let's view it first
# Assign the package dataset to an object of the same name in our workspace.
alaska_flights <- alaska_flights

# Variable descriptions
# Uncomment the next line to open the help page.
#?alaska_flights

# Basic scatterplot; note how the plus sign connects the different layers of the plot.
# Note how the three elements of "grammar of graphics" are implemented with ggplot:
# the data (alaska_flights), the aesthetic mapping (aes), and the geometry (geom_point).
# The geometry name for a scatterplot is "geom_point"
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point()

# What sort of relationship do we visualize? How do we *communicate* this? 

# We can add different features, which we'll do throughout the semester. 
# Let's add more informative labels for y and x axis
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point() +
  labs(x="Departure Delay", y="Arrival Delay")

# Let's say we wanted a white background in the plot
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point() +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

# Transparent dots and white background
# (with transparency, areas where many points overlap look darker)
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point(alpha = .2) +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

# Make dots smaller
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point(size=.5, alpha=.2) +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

# "Jitter" the points -- good option if there are many overlaps
# width and height set how far points may be nudged, in the units of each axis.
# Jitter only changes where points are drawn, not the underlying data.
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_jitter(width = 30, height = 30) +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

# Color code by month (factor() so each month gets its own distinct color)
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay, color=factor(month))) + 
  geom_jitter(width = 30, height = 30) +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

# Let's add a "line of best fit" to the plot
# method="lm" fits a straight (linear regression) line
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point() +
  geom_smooth(method="lm") +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

# Take out the confidence interval (shading around line)
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point() +
  geom_smooth(method="lm", se=FALSE) +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

# Set limits and increments for y-axis labels
# The visible range is set with coord_cartesian() rather than with scale limits:
# scale limits discard out-of-range flights before geom_smooth() fits the line,
# whereas coord_cartesian() only zooms the view, so the fitted line is the same
# as in the previous plot.
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Departure Delay", y = "Arrival Delay") +
  scale_y_continuous(breaks = seq(-100, 200, by = 50)) +
  coord_cartesian(ylim = c(-100, 200)) +
  theme_minimal()

###################################################################################### 
################################## Linegraphs   ################################## 
###################################################################################### 

# We'll use the "early_january_weather" data from moderndive package
# Assign the package dataset to an object of the same name in our workspace.
early_january_weather <- early_january_weather
# Uncomment the next line to open the help page.
#?early_january_weather

# Let's plot temperatures ("temp") in Jan. against time ("time_hour")
# The geometry for a linegraph is "geom_line"
ggplot(early_january_weather, aes(x = time_hour, y = temp)) +
  geom_line()

# One other cool graph is a "line smoother" to visualize the trend in a more effective way
# (no method is given here, so geom_smooth picks its default smoother; se=FALSE
# removes the shaded confidence band)
ggplot(early_january_weather, aes(x = time_hour, y = temp)) +
  geom_line() +
  geom_smooth(se=FALSE)

# We can change the limits on each axis; let's change the y axis
ggplot(early_january_weather, aes(x = time_hour, y = temp)) +
  geom_line() +
  geom_smooth(se=FALSE) +
  ylim(0, 75) +
  labs(x="Time/hour", y="Temperature")

# We can also set the limits and the increments we want on y-axis labels
# (breaks = where the axis labels/gridlines go)
ggplot(early_january_weather, aes(x = time_hour, y = temp)) +
  geom_line() +
  geom_smooth(se=FALSE) +
  labs(x="Time/hour", y="Temperature") +
  scale_y_continuous(limits = c(0, 75), breaks = seq(0, 75, by = 10))

##############################  DATA WRANGLING (Ch. 3) ##############################
# All data wrangling tools in Ch. 3 are from the package "dplyr," which is loaded
# when you load the "tidyverse" package. 

############################# 1. The Pipe Operator, |> #############################
# Allows you to link different aspects of code. "First, I'll do this, 
# THEN I'll do that, THEN I'll do this." 
# The pipe operator is the "THEN", or the connector.

# Background on pipes. There are two pipes that work the same way for our purposes: 
    # 1.  %>% from tidyverse (the magrittr package)
    # 2.  |> from base R (version 4.1 or later)
# We will use the second one. But when you see reference to the first, realize
# they do the same job.

# Keyboard shortcut: shift-command-m for Macs, shift-ctrl-m for PCs. |> 

# Change to "native pipe," or |> 
# Click on Edit -> Preferences; click "Code" on sidebar; 
# check "Use native pipe operator, |>" 

# Example

# This command:

# Without the pipe: the data frame is written as the first argument of ggplot()
ggplot(flights, aes(x = carrier)) +
  geom_bar()

# is identical to this command:

# With the pipe: "flights" is passed along as the first argument of ggplot(),
# so only the aesthetics are written inside the call
flights |>
  ggplot(aes(x = carrier)) +
  geom_bar()

# Note the difference: "Use this data, flights, THEN produce a bar graph."

########################## 2. Filter: Select subset of observations ######################

# Let's use the "states" data that you'll use in Assignment 1

# Note that you can list all the variables in the dataset by running:
# (the names are printed in alphabetical order, as the output below shows)
ls(states)
  [1] "Abort_rank3"             "abort_rate05"           
  [3] "abort_rate08"            "Abortion_rank12"        
  [5] "abortlaw10"              "abortlaw2017"           
  [7] "abortlaw3"               "Adv_or_more"            
  [9] "alcohol"                 "attend_pct"             
 [11] "BA_or_more"              "ba_or_more_2015"        
 [13] "battle04"                "biz_tax_rank"           
 [15] "biz_tax_score"           "black_legis_2015"       
 [17] "blackpct_2016"           "blkleg"                 
 [19] "blkpct04"                "blkpct08"               
 [21] "blkpct10"                "bush00"                 
 [23] "bush04"                  "carfatal"               
 [25] "carfatal07"              "cig_tax"                
 [27] "cig_tax_3"               "Cig_tax12"              
 [29] "Cig_tax12_3"             "cigarettes"             
 [31] "citizen_ideology"        "clinton16"              
 [33] "clinton16_ev"            "college"                
 [35] "conpct_m"                "cons_hr06"              
 [37] "cons_hr09"               "Conserv_advantage"      
 [39] "Conserv_public"          "cook_index"             
 [41] "cook_index3"             "corrections_incarc_rate"
 [43] "corrections_total_rate"  "crime_rate_burglary"    
 [45] "crime_rate_murder"       "crime_rate_property"    
 [47] "crime_rate_violent"      "deathpen_executions"    
 [49] "deathpen_exonerations"   "defexpen"               
 [51] "Dem_advantage"           "dem_hr09"               
 [53] "demHR11"                 "demnat06"               
 [55] "dempct_m"                "demstate_2017"          
 [57] "demstate06"              "demstate09"             
 [59] "demstate13"              "density"                
 [61] "division"                "drug_death_rate"        
 [63] "earmarks_pcap"           "evm"                    
 [65] "evo"                     "evo2012"                
 [67] "evr2012"                 "gay_policy"             
 [69] "gay_policy_con"          "gay_policy2"            
 [71] "gay_support"             "gay_support3"           
 [73] "gb_win00"                "gb_win04"               
 [75] "gini_2016"               "gini_rank_2016"         
 [77] "gore00"                  "Govt_worker"            
 [79] "gun_check"               "gun_dealer"             
 [81] "gun_murder10"            "gun_rank_rev"           
 [83] "Gun_rank11"              "gun_rank2015"           
 [85] "Gun_rank3"               "Gun_scale11"            
 [87] "gunlaw_rank"             "gunlaw_rank3_rev"       
 [89] "gunlaw_scale"            "gunlaw_scale2015"       
 [91] "hispanic_legis_2015"     "hispanic04"             
 [93] "hispanic08"              "hispanic10"             
 [95] "hispanicpct_2016"        "HR_cons_rank11"         
 [97] "HR_conserv11"            "HR_lib_rank11"          
 [99] "HR_liberal11"            "HS_or_more"             
[101] "hs_or_more_2015"         "hs_yrs_ss"              
[103] "indpct_m"                "infant_mortality"       
[105] "judge_selection"         "kerry04"                
[107] "legalclimate"            "legalclimate_rank"      
[109] "legis_prof_rank"         "legis_prof_score"       
[111] "lgbtq_equality_laws"     "libpct_m"               
[113] "mccain08"                "medicaid_expansion"     
[115] "min_wage"                "modpct_m"               
[117] "nader00"                 "obama_win08"            
[119] "Obama_win12"             "obama08"                
[121] "Obama2012"               "obesity_percent"        
[123] "opioid_rx_rate"          "over64"                 
[125] "over64_2016"             "permit"                 
[127] "polarization_house"      "polarization_senate"    
[129] "policy_innovation_rate"  "pop_18_24"              
[131] "pop_18_24_10"            "Pop2000"                
[133] "Pop2010"                 "Pop2010_hun_thou"       
[135] "pop2016"                 "Popchng0010"            
[137] "PopchngPct"              "Pot_policy"             
[139] "pot_policy2017"          "prcapinc"               
[141] "preg_teen_rate"          "preg_uninten_rate"      
[143] "ProChoice"               "prochoice_percent"      
[145] "ProLife"                 "reg"                    
[147] "Relig_Cath"              "Relig_high"             
[149] "relig_import"            "relig_import_2016"      
[151] "Relig_low"               "Relig_Prot"             
[153] "religiosity"             "Religiosity3"           
[155] "reppct_m"                "Romney2012"             
[157] "rtw"                     "schools_avg_salary"     
[159] "schools_spend"           "schools_st_ratio"       
[161] "secularism"              "secularism3"            
[163] "seniority_sen2"          "Smokers12"              
[165] "south"                   "state"                  
[167] "state_govt_rank"         "StateID"                
[169] "suicide_rate"            "term_limits"            
[171] "to_0004"                 "to_0408"                
[173] "TO_0812"                 "trnout00"               
[175] "trnout04"                "trump16"                
[177] "trump16_ev"              "unemploy"               
[179] "unemploy2016"            "Uninsured_pct"          
[181] "uninsured_pct_2015"      "union_2016"             
[183] "union04"                 "union07"                
[185] "union10"                 "urban"                  
[187] "vep00_turnout"           "vep04_turnout"          
[189] "vep08_turnout"           "vep12_turnout"          
[191] "vep14_turnout"           "vep16_turnout"          
[193] "volunteer_hrs_pc"        "volunteer_rate"         
[195] "voter_id_law"            "whitepct_2016"          
[197] "womleg_2007"             "womleg_2010"            
[199] "womleg_2011"             "womleg_2017"            
# Let's say we wanted to create two separate datasets: One for Southern states and
# one for non-Southern states. We can use the "filter" command

# First, let's see how "south" is coded; note 1=South, 0=non-South
# freq() prints the count (n) and percentages for each value of the variable
freq(states$south)
              n  % val%
[0] Nonsouth 34 68   68
[1] South    16 32   32
# New data object holding only the Southern states (south equal to 1); the pipe
# hands "states" to filter(), which keeps the rows that meet the condition
south <- states |>
  filter(south == 1)

# New data object holding only the non-Southern states (south equal to 0)
nonsouth <- states |>
  filter(south == 0)

# Describe 2016 voter turnout separately for each group, starting with the
# non-Southern states (both plots share the same y-axis range, 40 to 80)
nonsouth |>
  ggplot(mapping = aes(y = vep16_turnout)) +
  geom_boxplot() +
  ylim(40, 80)

# Southern states
south |>
  ggplot(mapping = aes(y = vep16_turnout)) +
  geom_boxplot() +
  ylim(40, 80)

# Or put both groups side by side using the original states data; factor(south)
# turns the 0/1 variable into categories, and scale_x_discrete() labels them
states |>
  ggplot(mapping = aes(x = factor(south), y = vep16_turnout)) +
  geom_boxplot() +
  ylim(40, 80) +
  labs(x = "non-South v. South", y = "Voter Turnout 2016") +
  scale_x_discrete(labels = c("non-South", "South"))

# One huge value of the pipe is that we actually would not need to create the two 
# new data objects for South and non-South. We can integrate "subsetting" with 
# the graphing or data command we want to execute.

# This (using the saved "south" object):
south |>
  ggplot(mapping = aes(y = vep16_turnout)) +
  geom_boxplot() +
  ylim(40, 80)

# generates the exact same graph as this (filtering inside the pipeline):
states |>
  filter(south == 1) |>
  ggplot(mapping = aes(y = vep16_turnout)) +
  geom_boxplot() +
  ylim(40, 80)

########################## 2. Summarize ######################
# We can use this command to generate summary statistics, like mean, median, standard
# deviation, etc. 

# Mean and standard deviation of 2016 turnout across all states
states |>
  summarize(
    mean = mean(vep16_turnout),
    std_dev = sd(vep16_turnout)
  )
# A tibble: 1 × 2
   mean std_dev
  <dbl>   <dbl>
1  61.7    6.38
# The same summary can be saved as an object (a "tibble") instead of printed
stats <- states |>
  summarize(
    mean = mean(vep16_turnout),
    std_dev = sd(vep16_turnout)
  )

# Typing the object's name, "stats", prints it
stats
# A tibble: 1 × 2
   mean std_dev
  <dbl>   <dbl>
1  61.7    6.38
# filter and summarize can be chained; South and non-South example.
# Southern states first:
states |>
  filter(south == 1) |>
  summarize(
    mean = mean(vep16_turnout),
    std_dev = sd(vep16_turnout)
  )
# A tibble: 1 × 2
   mean std_dev
  <dbl>   <dbl>
1  58.7    5.90
# Same summary for the non-Southern states
states |>
  filter(south == 0) |>
  summarize(
    mean = mean(vep16_turnout),
    std_dev = sd(vep16_turnout)
  )
# A tibble: 1 × 2
   mean std_dev
  <dbl>   <dbl>
1  63.0    6.20
# skim() from the skimr package reports the mean, sd, quartiles and more in one
# call. Southern states:
states |>
  filter(south == 1) |>
  skim(vep16_turnout)
Data summary
Name filter(states, south == 1…
Number of rows 16
Number of columns 200
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
vep16_turnout 0 1 58.72 5.9 50.1 52.92 59.5 64.75 67.2 ▇▂▆▂▇
# Same skim() summary for the non-Southern states
states |>
  filter(south == 0) |>
  skim(vep16_turnout)
Data summary
Name filter(states, south == 0…
Number of rows 34
Number of columns 200
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
vep16_turnout 0 1 63.04 6.2 43 59.7 63.5 65.7 74.8 ▁▁▇▇▃
########################## 3. Group by ######################
# This groups observations by variable values and executes commands for each group

# Let's get avg. turnout for each of four regions in the U.S., using the "reg" variable
# 1=NE, 2=MW, 3=South, 4=West

# First check how many states fall in each region
freq(states$reg)
               n  % val%
[1] Northeast  9 18   18
[2] Midwest   12 24   24
[3] South     16 32   32
[4] West      13 26   26
# One row per region: group the states by "reg", then summarize within each group
region <- states |>
  group_by(reg) |>
  summarize(
    mean = mean(vep16_turnout),
    std_dev = sd(vep16_turnout)
  )

# Print the saved tibble
region
# A tibble: 4 × 3
  reg            mean std_dev
  <dbl+lbl>     <dbl>   <dbl>
1 1 [Northeast]  65.5    5.19
2 2 [Midwest]    64.3    4.79
3 3 [South]      58.7    5.90
4 4 [West]       60.1    7.14
# Saving as "region" is optional; the same pipeline prints the table directly
states |>
  group_by(reg) |>
  summarize(
    mean = mean(vep16_turnout),
    std_dev = sd(vep16_turnout)
  )
# A tibble: 4 × 3
  reg            mean std_dev
  <dbl+lbl>     <dbl>   <dbl>
1 1 [Northeast]  65.5    5.19
2 2 [Midwest]    64.3    4.79
3 3 [South]      58.7    5.90
4 4 [West]       60.1    7.14
# group_by also works with skim(): one row of descriptive statistics per region
states |>
  group_by(reg) |>
  skim(vep16_turnout)
Data summary
Name group_by(states, reg)
Number of rows 50
Number of columns 200
_______________________
Column type frequency:
numeric 1
________________________
Group variables reg

Variable type: numeric

skim_variable reg n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
vep16_turnout 1 0 1 65.54 5.19 57.3 63.60 65.4 68.30 72.8 ▃▁▇▂▃
vep16_turnout 2 0 1 64.33 4.79 57.9 61.40 63.6 66.52 74.8 ▅▇▂▃▂
vep16_turnout 3 0 1 58.72 5.90 50.1 52.92 59.5 64.75 67.2 ▇▂▆▂▇
vep16_turnout 4 0 1 60.10 7.14 43.0 57.30 60.4 64.30 72.1 ▁▁▇▅▂
# Graph it, using geom_col()

# Region labels keyed by region code (1=NE, 2=MW, 3=South, 4=West). Because the
# vector is NAMED, ggplot matches each label to its bar by code, so the labels
# stay correct however the bars are sorted.
region_labels <- c("1" = "NE", "2" = "MW", "3" = "South", "4" = "West")

region |>
  ggplot(aes(x = factor(reg), y = mean)) +
  geom_col() +
  labs(x = "Region", y = "Percent Turnout") +
  scale_x_discrete(labels = region_labels)

# Sort bars in descending order (reorder by -mean); bars show the region codes
region |>
  ggplot(aes(x = reorder(factor(reg), -mean), y = mean)) +
  geom_col() +
  labs(x = "Region", y = "Percent Turnout")

# Sort bars in descending order AND add the region labels
region |>
  ggplot(aes(x = reorder(factor(reg), -mean), y = mean)) +
  geom_col() +
  labs(x = "Region", y = "Percent Turnout") +
  scale_x_discrete(labels = region_labels)

# Flip coordinates (horizontal bars); bars show the region codes
region |>
  ggplot(aes(x = reorder(factor(reg), mean), y = mean)) +
  geom_col() +
  labs(x = "Region", y = "Percent Turnout") +
  coord_flip()

# Flip coordinates AND add the region labels
region |>
  ggplot(aes(x = reorder(factor(reg), mean), y = mean)) +
  geom_col() +
  labs(x = "Region", y = "Percent Turnout") +
  coord_flip() +
  scale_x_discrete(labels = region_labels)

# Flip coordinates, make bars thinner
region |>
  ggplot(aes(x = reorder(factor(reg), mean), y = mean)) +
  geom_col(width = .5) +
  labs(x = "Region", y = "Percent Turnout") +
  coord_flip() +
  scale_x_discrete(labels = region_labels)

# Sorted turnout by state - bars
# The x variable is the state abbreviation (StateID), so the axis is labeled
# "State"; reorder() sorts the states by their turnout
states |>
  ggplot(aes(x = reorder(StateID, vep16_turnout), y = vep16_turnout)) +
  geom_col(width = .5) +
  labs(x = "State", y = "Percent Turnout") +
  coord_flip()

# Sorted turnout by state - dots
states |>
  ggplot(aes(x = reorder(StateID, vep16_turnout), y = vep16_turnout)) +
  geom_point() +
  labs(x = "State", y = "Percent Turnout") +
  coord_flip()

# Scatterplot of turnout against union_2016, with each point labeled by StateID
states |>
  ggplot(mapping = aes(x = union_2016, y = vep16_turnout)) +
  geom_point() +
  labs(x = "Union", y = "Percent Turnout") +
  geom_smooth(method = "lm", se = FALSE) +
  geom_text(mapping = aes(label = StateID))

# Same scatterplot, but the labels are drawn with geom_text_repel() from ggrepel
states |>
  ggplot(mapping = aes(x = union_2016, y = vep16_turnout)) +
  geom_point() +
  labs(x = "Union", y = "Percent Turnout") +
  geom_smooth(method = "lm", se = FALSE) +
  geom_text_repel(mapping = aes(label = StateID))

# Same ggrepel scatterplot with smaller labels (size = 3)
states |>
  ggplot(mapping = aes(x = union_2016, y = vep16_turnout)) +
  geom_point() +
  labs(x = "Union", y = "Percent Turnout") +
  geom_smooth(method = "lm", se = FALSE) +
  geom_text_repel(mapping = aes(label = StateID), size = 3)

# Scatterplot with labels AND ggrepel; make labels smaller; add % to axis labels
# First attempt: labels = percent treats the values as proportions, but these
# variables are already on a 0-100 scale, so the axis labels come out far too
# large. The next plot fixes this.
states |>
  ggplot(aes(x = union_2016, y = vep16_turnout)) +
  geom_point() +
  labs(x = "Union", y = "Percent Turnout") +
  geom_smooth(method = "lm", se = FALSE) +
  geom_text_repel(aes(label = StateID), size = 3) +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = percent)

# Scatterplot with labels AND ggrepel; make labels smaller; add % to axis labels
# Note: labels = percent assumes we have proportions. So we can transform our
# variables right in the aes(x=....) by dividing them by 100
states |>
  ggplot(aes(x = union_2016 / 100, y = vep16_turnout / 100)) +
  geom_point() +
  labs(x = "Union", y = "Percent Turnout") +
  geom_smooth(method = "lm", se = FALSE) +
  geom_text_repel(aes(label = StateID), size = 3) +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = percent)

########################## 4. Create new variables using "mutate" ###########

# Let's say we wanted a more intuitive variable name for turnout. We add it to the
# "states" data as a NEW variable, "turnout"; the original vep16_turnout is kept.
# NEVER override existing data. Instead, create new variable.
states <- states |>
  mutate(
    turnout = vep16_turnout
  )

# Let's say we need to recode a variable. Again, don't override existing var. 
# Create new one
# NES example from Assignment 1

# Start by looking at how the original variable, V201228, is coded
freq(nes$V201228)
                                                 n    % val%
[-9] -9. Refused                                44  0.5  0.5
[-8] -8. Don't know                              4  0.0  0.0
[-4] -4. Technical error                         1  0.0  0.0
[0] 0. No preference {VOL - video/phone only}    7  0.1  0.1
[1] 1. Democrat                               2865 34.6 34.6
[2] 2. Republican                             2563 31.0 31.0
[3] 3. Independent                            2527 30.5 30.5
[5] 5. Other party {SPECIFY}                   269  3.2  3.2
# Copy V201228 into a new variable with a more intuitive name, pid3
nes <- nes |>
  mutate(
    pid3 = V201228
  )

# The copy has the same distribution as the original
freq(nes$pid3)
                                                 n    % val%
[-9] -9. Refused                                44  0.5  0.5
[-8] -8. Don't know                              4  0.0  0.0
[-4] -4. Technical error                         1  0.0  0.0
[0] 0. No preference {VOL - video/phone only}    7  0.1  0.1
[1] 1. Democrat                               2865 34.6 34.6
[2] 2. Republican                             2563 31.0 31.0
[3] 3. Independent                            2527 30.5 30.5
[5] 5. Other party {SPECIFY}                   269  3.2  3.2
# Sidenote: group_by, summarize, and mutate can also build a freq distribution.
# Step 1: count the respondents in each pid3 category
nes |>
  group_by(pid3) |>
  summarize(n = n())
# A tibble: 8 × 2
  pid3                                               n
  <dbl+lbl>                                      <int>
1 -9 [-9. Refused]                                  44
2 -8 [-8. Don't know]                                4
3 -4 [-4. Technical error]                           1
4  0 [0. No preference {VOL - video/phone only}]     7
5  1 [1. Democrat]                                2865
6  2 [2. Republican]                              2563
7  3 [3. Independent]                             2527
8  5 [5. Other party {SPECIFY}]                    269
# Step 2: add each category's share of the total as a percentage
nes |>
  group_by(pid3) |>
  summarize(n = n()) |>
  mutate(pct = 100 * n / sum(n))
# A tibble: 8 × 3
  pid3                                               n     pct
  <dbl+lbl>                                      <int>   <dbl>
1 -9 [-9. Refused]                                  44  0.531 
2 -8 [-8. Don't know]                                4  0.0483
3 -4 [-4. Technical error]                           1  0.0121
4  0 [0. No preference {VOL - video/phone only}]     7  0.0845
5  1 [1. Democrat]                                2865 34.6   
6  2 [2. Republican]                              2563 31.0   
7  3 [3. Independent]                             2527 30.5   
8  5 [5. Other party {SPECIFY}]                    269  3.25  
# Change anything but Dem, Rep, Indep to missing data (NA) and swap the codes
# for Ind and Rep, so that 1 = Democrat, 2 = Independent, 3 = Republican.
# Every step tests the ORIGINAL variable (V201228), so the swap in one step
# cannot be undone by the next.
# Then create value labels. The levels are listed explicitly so each label is
# tied to its code even if a category happens to have no respondents.
nes <- nes |>
  mutate(
    pid3 = ifelse(V201228 < 1 | V201228 > 4, NA, V201228),
    pid3 = ifelse(V201228 == 2, 3, pid3),
    pid3 = ifelse(V201228 == 3, 2, pid3),
    pid3 = factor(
      pid3,
      levels = c(1, 2, 3),
      labels = c("Democrat", "Independent", "Republican")
    )
  )

# Numeric version of the same recode, without value labels:
# 1 = Democrat, 2 = Independent, 3 = Republican, NA = everything else.
# Each step tests the original V201228, so the swap of codes 2 and 3 is safe.
nes <- nes |>
  mutate(
    pid3_alt = ifelse(V201228 < 1 | V201228 > 4, NA, V201228),
    pid3_alt = ifelse(V201228 == 2, 3, pid3_alt),
    pid3_alt = ifelse(V201228 == 3, 2, pid3_alt)
  )

# Check the recode: counts and percentages for each pid3_alt value
nes |>
  group_by(pid3_alt) |>
  summarize(n = n()) |>
  mutate(pct = 100 * n / sum(n))
# A tibble: 4 × 3
  pid3_alt     n   pct
     <dbl> <int> <dbl>
1        1  2865 34.6 
2        2  2527 30.5 
3        3  2563 31.0 
4       NA   325  3.93
# Bar chart of the numeric recode, leaving out respondents with missing values
nes |>
  filter(!is.na(pid3_alt)) |>
  ggplot(mapping = aes(x = pid3_alt)) +
  geom_bar()

# Same chart using the labeled factor version, pid3
nes |>
  filter(!is.na(pid3)) |>
  ggplot(mapping = aes(x = pid3)) +
  geom_bar()

# Show shares instead of counts: each bar's count divided by the total count,
# with the y axis formatted as a percent; width = .5 makes the bars thinner
nes |>
  filter(!is.na(pid3)) |>
  ggplot(mapping = aes(x = pid3)) +
  geom_bar(
    mapping = aes(y = after_stat(count / sum(count))),
    width = .5
  ) +
  scale_y_continuous(labels = percent) +
  labs(x = "Party Identification", y = "Percent")

# Shorter axis labels via scale_x_discrete(); note that missing values (NA)
# are not filtered out in this first version
nes |>
  ggplot(mapping = aes(x = factor(pid3))) +
  geom_bar() +
  scale_x_discrete(labels = c("Dem", "Ind", "Rep"))

# Same chart after dropping respondents with missing party ID
nes |>
  filter(!is.na(pid3)) |>
  ggplot(mapping = aes(x = factor(pid3))) +
  geom_bar() +
  scale_x_discrete(labels = c("Dem", "Ind", "Rep"))

# Same chart again with thinner bars (width = .5)
nes |>
  filter(!is.na(pid3)) |>
  ggplot(mapping = aes(x = factor(pid3))) +
  geom_bar(width = .5) +
  scale_x_discrete(labels = c("Dem", "Ind", "Rep"))

# Report pct and add appropriate labels

########################## 5. Select: subset variables ######################

# Let's say we're working with the states data, but we want to subset JUST the variables
# that we're analyzing. From Assignment 1, we use "vep16_turnout" and "ba_or_more_2015." 
# Let's use "state" and "StateID" as well.

# Keep just the four variables we need
st_reduced <- states |>
  select(
    state,
    StateID,
    vep16_turnout,
    ba_or_more_2015
  )

# You can also add copies of variables under more intuitive var names; the
# original columns are kept alongside the new "turnout" and "ba"
st_reduced <- states |>
  select(
    state,
    StateID,
    vep16_turnout,
    ba_or_more_2015
  ) |>
  mutate(
    turnout = vep16_turnout,
    ba = ba_or_more_2015
  )